{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dc9b596e-5bd9-4be8-8dff-749f1f3b4d89",
   "metadata": {},
   "source": [
    "# 模型微调(英译中翻译)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "562901d7-f9eb-4449-a638-ad05fd8c516e",
   "metadata": {},
   "source": [
    "### 1 数据预览"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b966c5af-6754-408b-9a0a-949939fcbf8d",
   "metadata": {},
   "outputs": [],
   "source": [
    "!head -5 News-Commentary-v16-1000.en-zh"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24b805d5-ea63-49e3-8c2b-b772a6cecd21",
   "metadata": {},
   "source": [
    "### 2 数据处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c8b3e9d3-7047-42be-b4cc-3bc48e42d043",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "84006d49-d727-4f49-a760-ef03c876120c",
   "metadata": {},
   "outputs": [],
   "source": [
    "prompt='\"{txt}\"\\n请将上面的句子翻译成中文(仅输出翻译结果):\\n'\n",
    "with open('News-Commentary-v16-1000.en-zh','r') as f:\n",
    "    with open('sample.jsonl','w') as g:\n",
    "        for i,line in enumerate(f):\n",
    "            en,zh=line.strip().split('\\t')\n",
    "            s= [{'user':prompt.format(txt=en)},\n",
    "                {'assistant':zh}]\n",
    "            g.write(json.dumps(s,ensure_ascii=False)+'\\n')\n",
    "        for _ in range(5):\n",
    "            g.write(json.dumps([{'user':'你是谁？'}, {'assistant':'我是工银智涌大模型。'}],ensure_ascii=False)+'\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5402fe21-2ae3-48b8-a776-2bcd98f8b246",
   "metadata": {},
   "source": [
    "### 3 数据转换"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5be00088-d4d5-4f1e-9c89-74dab315321d",
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m jllm.raw2ids \\\n",
    "    --tokenizer Qwen2.5-7B-Instruct \\\n",
    "    -i sample.jsonl \\\n",
    "    --max_len 8193 -C --filter"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "36eeb743-e622-4563-9597-3f9965ff9635",
   "metadata": {},
   "source": [
    "### 3.1 数据检查"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "be35198f-a346-4915-8110-e53b6103697f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "tokenizer=AutoTokenizer.from_pretrained('Qwen2.5-7B-Instruct')\n",
    "import pyarrow.parquet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ab54f8f1-4dc1-448b-8eaf-cb90c12064d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "prompt = \"Give me a short introduction to large language model.\"\n",
    "messages = [\n",
    "    {\"role\": \"system\", \"content\": \"You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\"},\n",
    "    {\"role\": \"user\", \"content\": prompt}\n",
    "]\n",
    "text = tokenizer.apply_chat_template(\n",
    "    messages,\n",
    "    tokenize=False,\n",
    "    add_generation_prompt=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "253a903b-8503-4e14-8204-be1e95d7443f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<|im_start|>system\n",
      "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n",
      "<|im_start|>user\n",
      "Give me a short introduction to large language model.<|im_end|>\n",
      "<|im_start|>assistant\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d8802b0e-abf3-43bb-a27d-3438e289e6a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pyarrow.parquet.read_table('sample_Qwen2.5-7B-Instruct/sample-00000')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "0caaaa71-b6d3-4a52-8742-a3857a423d84",
   "metadata": {},
   "outputs": [],
   "source": [
    "input_ids=data['input_ids'].to_numpy().tolist()\n",
    "labels=[i.copy() for i in data['labels'].to_numpy().tolist()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "eaa2bbb1-a774-437b-aa06-74f95df5b7f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "idx=-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "9482efe9-d97b-49a5-868d-018c6a7ad2e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "labels[idx][0]=-100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2586ef9e-9680-4ee9-947a-20798433ae87",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(tokenizer.decode(input_ids[idx]));\n",
    "labels[idx][labels[idx]==-100]=9\n",
    "print(tokenizer.decode(labels[idx]))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "df2a40cc-c6a5-409b-b0a9-5afba39c1b1a",
   "metadata": {},
   "source": [
    "## 4 模型训练"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1ae4e2ce-afd6-4bff-9122-084d140a4950",
   "metadata": {},
   "source": [
    "#### 4.1全参微调"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02853282-09e4-4587-aaa7-68a09c430ab9",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!deepspeed --module jllm.train_pipe \\\n",
    "    --model pretrained_hf \\\n",
    "    --num_train_epochs 2 \\\n",
    "    --train_data sample_Qwen2.5-7B-Instruct \\\n",
    "    --pipe_parallel_size 4 \\\n",
    "    --model_parallel_size 1 \\\n",
    "    --sequence_parallel_size 1 \\\n",
    "    --per_device_train_batch_size 1 \\\n",
    "    --global_batch_size 16 \\\n",
    "    --partition_method fast \\\n",
    "    --split_dlayer \\\n",
    "    --only_ckpt_model \\\n",
    "    --max_num_checkpoints 2 \\\n",
    "    --split_dlayer \\\n",
    "    --learning_rate 1e-5 |tee finetuning.log\n",
    "\n",
    "# 注释：\n",
    "# --model 模型路径至少需要包含config.json\n",
    "# --num_train_epochs 训练轮数\n",
    "# --train_data 训练数据\n",
    "# --pipe_parallel_size 流水线并行个数\n",
    "# --model_parallel_size 张量并行个数\n",
    "# --per_device_train_batch_size 一次输入训练多少样本\n",
    "# --global_batch_size 全局训练完多少样本后（累加完多少个梯度后）进行一次参数更新\n",
    "# --partition_method fast 流水线拆分策略\n",
    "# --only_ckpt_model 只check模型参数，此时会直接存成huggingface格式\n",
    "# --checkpoint checkpoint 模型检查点目录\n",
    "# --output_dir pretrained 最终模型输出目录\n",
    "# --max_num_checkpoints 2 最大保留多少个检查点\n",
    "# --split_dlayer 是否拆分docoder layer\n",
    "# --learning_rate 1e-5 学习率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e87f593f-99d9-4d1e-a06a-769da2c35085",
   "metadata": {},
   "outputs": [],
   "source": [
    "!grep  'steps:.*loss:' finetuning.log|awk '{print $2,$4}'>loss\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "xy=np.loadtxt('loss')  \n",
    "plt.plot(xy[:,0], xy[:,1])  \n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "819336d8-dc0d-42ca-b347-eb3d947004d1",
   "metadata": {},
   "source": [
    "#### 4.2 LORA微调"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58818709-18f3-4e42-9a06-ccecadf1aee1",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!deepspeed --module jllm.train_pipe \\\n",
    "    --model pretrained_hf \\\n",
    "    --num_train_epochs 3 \\\n",
    "    --train_data sample_Qwen2.5-7B-Instruct \\\n",
    "    --pipe_parallel_size 4 \\\n",
    "    --model_parallel_size 1 \\\n",
    "    --per_device_train_batch_size 1 \\\n",
    "    --global_batch_size 32 \\\n",
    "    --partition_method fast \\\n",
    "    --split_dlayer \\\n",
    "    --only_ckpt_lora \\\n",
    "    --checkpoint checkpoint_lora \\\n",
    "    --max_num_checkpoints 2 \\\n",
    "    --skip_epoch 1,2 \\\n",
    "    --split_dlayer \\\n",
    "    --lora_dim 32 \\\n",
    "    --lora_alpha 32 \\\n",
    "    --lora_module_name 'qkv_proj,o_proj,gate_up_proj,down_proj' \\\n",
    "    --only_optimize_lora \\\n",
    "    --learning_rate 1e-5 |tee lora.log\n",
    "\n",
    "#注释：\n",
    "# --model 模型路径至少需要包含config.json\n",
    "# --num_train_epochs 训练轮数\n",
    "# --train_data 训练数据\n",
    "# --pipe_parallel_size 流水线并行个数\n",
    "# --model_parallel_size 张量并行个数\n",
    "# --per_device_train_batch_size 一次输入训练多少样本\n",
    "# --global_batch_size 训练完多少样本后（累加完多少个梯度后）进行一次参数更新\n",
    "# --partition_method 流水线拆分策略\n",
    "# --only_ckpt_model 只check模型参数，此时会直接存成huggingface格式\n",
    "# --checkpoint checkpoint 模型检查点目录\n",
    "# --output_dir pretrained 最终模型输出目录\n",
    "# --max_num_checkpoints 2 最大保留多少个检查点\n",
    "# --skip_epoch 跳过的检查点\n",
    "# --split_dlayer 是否拆分docoder layer\n",
    "# --lora_dim lora参数的秩\n",
    "# --lora_alpha lora参数的权重\n",
    "# --lora_module_name 对哪些线性层执行lora替换\n",
    "# --only_optimize_lora 只对被lora替换的参数进行梯度更新\n",
    "# --learning_rate 1e-5 学习率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67a9f80e-a498-44cb-9cde-1e52ad24b354",
   "metadata": {},
   "outputs": [],
   "source": [
    "!grep  'steps:.*loss:' lora.log|awk '{print $2,$4}'>loss\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "xy=np.loadtxt('loss')  \n",
    "plt.plot(xy[:,0], xy[:,1])  \n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c6b4798d-8c1e-435c-b157-db5cd2c71ec2",
   "metadata": {},
   "source": [
    "## 5 推理测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f7884666-eb10-4929-9deb-132e8ba15271",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch_npu\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "tokenizer=AutoTokenizer.from_pretrained('Qwen2.5-7B-Instruct')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fd49ab6-0f49-4a15-8a78-3fa835305cff",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    'checkpoint/31',\n",
    "    torch_dtype=\"auto\",\n",
    "    device_map=\"auto\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d1886d7e-3099-40a7-ae44-bc528f1dcad5",
   "metadata": {},
   "outputs": [],
   "source": [
    "en='PARIS – As the economic crisis deepens and widens, the world has been searching for historical analogies to help us understand what has been happening.'\n",
    "prompt = f'\"{en}\"\\n请将上面的句子翻译成中文(仅输出翻译结果):\\n'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "0cce277b-0121-41a8-9e99-6e177fb1528d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def infer(prompt):\n",
    "    messages = [\n",
    "        {\"role\": \"system\", \"content\": \"You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\"},\n",
    "        {\"role\": \"user\", \"content\": prompt}\n",
    "    ]\n",
    "    text = tokenizer.apply_chat_template(\n",
    "        messages,\n",
    "        tokenize=False,\n",
    "        add_generation_prompt=True\n",
    "    )\n",
    "    model_inputs = tokenizer([text], return_tensors=\"pt\").to(model.device)\n",
    "    \n",
    "    generated_ids = model.generate(\n",
    "        **model_inputs,\n",
    "        max_new_tokens=128\n",
    "    )\n",
    "    generated_ids = [\n",
    "        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)\n",
    "    ]\n",
    "    \n",
    "    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07407ca7-baa4-490b-8048-dd8314428b34",
   "metadata": {},
   "outputs": [],
   "source": [
    "infer(prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6b035be-e6a1-4289-84ee-be0efc9dc3a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "infer('你是谁？')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
